2.1 Data Cleaning and Preparation
# Build the analysis table: pick the needed columns from ecls_raw and give
# them analysis-friendly names. Bare symbols inside list() keep their own
# name; `new = old` entries rename.
ecls <- ecls_raw[, list(
  childid,
  wave,
  # Time index derived from wave:
  #   wave 2 -> 0 (Spring Kindergarten, baseline)
  #   wave 4 -> 1 (Spring 1st Grade)
  #   wave 9 -> 2 (Spring 5th Grade)
  # Any other wave has no match and becomes NA.
  time = c(0, 1, 2)[match(wave, c(2, 4, 9))],
  # Outcomes
  math = math_score,
  science = science_score,
  # Food security variables
  fs_raw,
  fs_scale,
  fs_status,
  # Demographics (time-invariant)
  sex = x_chsex_r,
  race = X_RACETHP_R,
  # SES (baseline)
  ses_baseline = x12sesl,
  # School characteristics
  school_type,
  urbanicity = locale,
  # Additional controls
  household_size,
  disability
)]
# Missing-value handling for the food security measures and the outcomes.
# Scale score: values below -6 are set to missing; the value -6 itself is
# recoded to 1.4 (the original comment labels this "food secure").
# NOTE(review): scale values strictly between -6 and 0 are left as they are --
# confirm no such codes occur in fs_scale.
ecls[fs_scale < -6, fs_scale := NA_real_]
ecls[fs_scale == -6, fs_scale := 1.4]
# Raw score and test scores: any negative value is set to missing.
# (Rows that are already NA stay NA, so they need no explicit condition.)
ecls[fs_raw < 0, fs_raw := NA_real_]
ecls[math < 0, math := NA_real_]
ecls[science < 0, science := NA_real_]
# Baseline food security variables (constant within child).
# Preferred value: the child's wave-2 observation. If that is missing, fall
# back to the child's earliest non-missing observation *by wave*.
# The rows are not yet sorted at this point in the script, so the choice is
# made on an explicitly wave-ordered copy of each child's values rather than
# on raw row order; which() also keeps rows with a missing wave from being
# picked up as a spurious NA "baseline".
# NOTE(review): fs_status is not cleaned of negative codes above -- if such
# codes exist they count as observed values here; confirm.
ecls[, c("fs_baseline", "fs_status_baseline") := {
  chrono <- order(wave)
  at_base <- which(wave[chrono] == 2)
  first_observed <- function(v) {
    v <- v[chrono]
    # Wave-2 values first, then every value in wave order
    candidates <- c(v[at_base], v)
    candidates[!is.na(candidates)][1L]
  }
  list(first_observed(fs_scale), first_observed(fs_status))
}, by = childid]
# Food security change variable: current scale score minus the child's baseline
ecls[, fs_change := fs_scale - fs_baseline]
# Food insecurity indicator: 1 for status 2 or 3, 0 for status 1, and NA when
# the status is missing or outside the valid codes 1:3. (`%in%` alone never
# returns NA, which would silently code missing status as food secure.)
ecls[, fs_insecure := fifelse(
  fs_status %in% 1:3,
  as.numeric(fs_status %in% c(2, 3)),
  NA_real_
)]
# Cumulative exposure variable: running count of insecure waves per child.
# cumsum() follows row order, so sort chronologically within child first;
# waves with unknown status add 0 to the running count.
setorder(ecls, childid, time)
ecls[, fs_cumulative := cumsum(replace(fs_insecure, is.na(fs_insecure), 0)),
     by = childid]
# SES quartiles for the moderation analysis.
# Cut points are the 0/25/50/75/100th percentiles of ses_baseline over all
# rows; include.lowest = TRUE keeps the minimum value inside Q1.
ecls[, ses_quartile := {
  cut_points <- quantile(ses_baseline, probs = seq(0, 1, by = 0.25), na.rm = TRUE)
  cut(
    ses_baseline,
    breaks = cut_points,
    labels = c("Q1_Lowest", "Q2", "Q3", "Q4_Highest"),
    include.lowest = TRUE
  )
}]
# Convert sex to a two-level factor.
# Everything from the first ":" onward is stripped, then "1" is mapped to
# "Male" and "2" to "Female". Any other value becomes NA because it is not
# among the factor levels.
ecls[, sex := {
  sex_code <- sub(":.*", "", sex)
  sex_label <- fifelse(
    sex_code == "1", "Male",
    fifelse(sex_code == "2", "Female", sex_code)
  )
  factor(sex_label, levels = c("Male", "Female"))
}]
# Labelled factor version of fs_status (codes 1, 2, 3; anything else -> NA)
ecls[, fs_status_factor := factor(
  fs_status,
  levels = c(1, 2, 3),
  labels = c("High/Marginal", "Low", "Very Low")
)]
# Clean disability: strip everything from the first ":" onward, map "1" to
# "Yes" and "2" to "No", and store the result as a factor in place. Values
# that end up as neither "Yes" nor "No" become NA.
ecls[, disability := {
  dis_code <- sub(":.*", "", disability)
  dis_label <- fifelse(
    dis_code == "1", "Yes",
    fifelse(dis_code == "2", "No", dis_code)
  )
  factor(dis_label, levels = c("Yes", "No"))
}]
# Final row order: by child, then chronologically
setorder(ecls, childid, time)
2.2 Enhanced Categorical Variable Cleaning
# Create working copy for GEE analysis; copy() gives an independent table so
# the := calls below do not modify ecls.
gee_data_clean <- copy(ecls)
# NOTE(review): in all three recodes below a missing (NA) input matches no
# condition and therefore lands in "Other" -- confirm this is intended rather
# than keeping those rows as NA.

# Race: Collapse small categories. Both Hispanic codes (3 and 4) map to
# "Hispanic"; every value not listed falls into "Other".
gee_data_clean[, race_simple := fcase(
  race %in% c("1", "1:WHITE, NON-HISPANIC"), "White",
  race %in% c("2", "2:BLACK OR AFRICAN AMERICAN, NON-HISPANIC"), "Black",
  race %in% c("3", "3:HISPANIC, RACE SPECIFIED",
              "4", "4:HISPANIC, RACE NOT SPECIFIED"), "Hispanic",
  race %in% c("5", "5:ASIAN"), "Asian",
  default = "Other"
)]
gee_data_clean[, race_simple := factor(
  race_simple,
  levels = c("White", "Black", "Hispanic", "Asian", "Other")
)]
# School type: Simplify by case-insensitive keyword match on the label
gee_data_clean[, school_simple := fcase(
  grepl("PUBLIC", school_type, ignore.case = TRUE), "Public",
  grepl("PRIVATE|CATHOLIC", school_type, ignore.case = TRUE), "Private",
  default = "Other"
)]
gee_data_clean[, school_simple := factor(
  school_simple,
  levels = c("Public", "Private", "Other")
)]
# Urbanicity: Simplify by case-insensitive keyword match on the label.
# "SUBURB" must be tested before "CITY|URBAN": the word "SUBURBAN" contains
# "URBAN", so with the opposite order a suburban label spelled that way would
# be classified as Urban. fcase() uses the first condition that is TRUE.
gee_data_clean[, urban_simple := fcase(
  grepl("SUBURB", urbanicity, ignore.case = TRUE), "Suburban",
  grepl("CITY|URBAN", urbanicity, ignore.case = TRUE), "Urban",
  grepl("TOWN|RURAL", urbanicity, ignore.case = TRUE), "Rural",
  default = "Other"
)]
gee_data_clean[, urban_simple := factor(
  urban_simple,
  levels = c("Urban", "Suburban", "Rural", "Other")
)]
# ---- Diagnostics: distributions of the simplified categorical variables ----
# Banner (80 "=" characters above and below the title)
cat("\n", strrep("=", 80), "\n", sep = "")
##
## ================================================================================
cat("SIMPLIFIED CATEGORICAL VARIABLES\n")
## SIMPLIFIED CATEGORICAL VARIABLES
cat(strrep("=", 80), "\n\n", sep = "")
## ================================================================================
# Frequency tables; useNA = "ifany" adds an <NA> column only when NAs exist
cat("Race Distribution:\n")
## Race Distribution:
print(table(gee_data_clean[["race_simple"]], useNA = "ifany"))
##
## White Black Hispanic Asian Other
## 24078 6456 12972 4230 6786
cat("\nSchool Type Distribution:\n")
##
## School Type Distribution:
print(table(gee_data_clean[["school_simple"]], useNA = "ifany"))
##
## Public Private Other
## 39888 2817 11817
cat("\nUrbanicity Distribution:\n")
##
## Urbanicity Distribution:
print(table(gee_data_clean[["urban_simple"]], useNA = "ifany"))
##
## Urban Suburban Rural Other
## 14576 16600 12514 10832
cat("\nDisability Distribution:\n")
##
## Disability Distribution:
print(table(gee_data_clean[["disability"]], useNA = "ifany"))
##
## Yes No <NA>
## 5761 29041 19720
# Five-number summary (plus mean and NA count) of household size
cat("\nHousehold Size Summary:\n")
##
## Household Size Summary:
print(summary(gee_data_clean[["household_size"]]))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.000 4.000 4.000 4.644 5.000 18.000 17823
# ---- Closing banner and overall counts ----
cat("\n", strrep("=", 80), "\n", sep = "")
##
## ================================================================================
cat("DATA PREPARATION COMPLETE\n")
## DATA PREPARATION COMPLETE
cat(strrep("=", 80), "\n", sep = "")
## ================================================================================
# Row count of the working table (one row per child-wave)
cat("Observations after cleaning:", nrow(gee_data_clean), "\n")
## Observations after cleaning: 54522
# Number of distinct child identifiers
cat("Children with at least one observation:", uniqueN(gee_data_clean[["childid"]]), "\n")
## Children with at least one observation: 18174
# Sample sizes by wave: distinct children, non-missing counts per measure,
# and the percentage of children with both math and fs_scale observed.
sample_sizes <- gee_data_clean[, {
  has_math <- !is.na(math)
  has_fs <- !is.na(fs_scale)
  n_kids <- uniqueN(childid)
  list(
    n_children = n_kids,
    n_with_math = sum(has_math),
    n_with_science = sum(!is.na(science)),
    n_with_fs = sum(has_fs),
    pct_complete = round(100 * sum(has_math & has_fs) / n_kids, 1)
  )
}, by = wave]
print(kable(sample_sizes, caption = "Sample Sizes by Wave", digits = 1))
##
##
## Table: Sample Sizes by Wave
##
## | wave| n_children| n_with_math| n_with_science| n_with_fs| pct_complete|
## |----:|----------:|-----------:|--------------:|---------:|------------:|
## | 2| 18174| 17143| 16936| 12910| 68.7|
## | 4| 18174| 15103| 15072| 12313| 65.2|
## | 9| 18174| 11426| 11419| 9308| 46.9|
# Outcome means by wave: mean and standard deviation (missing values dropped,
# rounded to 2 decimals) of math, science and the food security scale.
outcome_means <- gee_data_clean[, {
  avg <- function(v) round(mean(v, na.rm = TRUE), 2)
  spread <- function(v) round(sd(v, na.rm = TRUE), 2)
  list(
    Math_Mean = avg(math),
    Math_SD = spread(math),
    Science_Mean = avg(science),
    Science_SD = spread(science),
    FS_Scale_Mean = avg(fs_scale),
    FS_Scale_SD = spread(fs_scale)
  )
}, by = wave]
cat("\n")
print(kable(outcome_means, caption = "Outcome Variables by Wave", digits = 2))
##
##
## Table: Outcome Variables by Wave
##
## | wave| Math_Mean| Math_SD| Science_Mean| Science_SD| FS_Scale_Mean| FS_Scale_SD|
## |----:|---------:|-------:|------------:|----------:|-------------:|-----------:|
## | 2| 49.86| 13.34| 33.48| 7.38| 1.93| 1.40|
## | 4| 72.25| 15.73| 42.36| 10.36| 1.88| 1.36|
## | 9| 119.66| 17.79| 73.17| 13.04| 1.74| 1.18|